// Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// This module implements support for Armv8 SM3 instructions

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8.2-a
.text
// ossl_hwsm3_block_data_order
//
// SM3 compression of one or more consecutive 64-byte blocks using the
// Armv8 SM3 instructions.  Those instructions are emitted as raw .inst
// words; the intended mnemonic is given in the trailing comment of each.
//
// Inputs, as used by the code below:
//   x0 - pointer to eight 32-bit state words; read on entry, written on exit
//   x1 - pointer to the input data; advanced by 64 bytes per block
//   w2 - number of 64-byte blocks to process
//
// NOTE(review): presumably declared in C as
//   void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
// confirm against the C header.  Only the low 32 bits of x2 are examined
// and the loop is bottom-tested (decrement first, cbnz at the end), so at
// least one block is always processed and a count of 0 is not handled.
//
// Register roles:
//   v5, v6    working state (state words 0-3 and 4-7, lane order reversed
//             relative to memory)
//   v18, v19  copy of v5/v6 taken at the start of each block
//   v0-v4     rolling window of the message schedule W[]; the register that
//             receives the next four expanded words rotates every 4 rounds
//   v16, v17  the two words of .Tj (lane 0, loaded once before the loop)
//   v20, v21  current round constant in lane 3; the two registers alternate
//             every round
//   v22, v23  temporaries (expansion inputs, W[j]^W[j+4], sm3ss1 result)
//   x8        address of .Tj
// The stack is not used and v8-v15 are not touched.
//
// Every round is the same pattern:
//   sm3ss1   v23, v5, vK, v6     vK is v20 or v21, whichever is current
//   shl/sri  vK', vK             vK' = vK rotated left by 1 bit in each
//                                32-bit lane = constant for the next round
//   sm3tt1x  v5, v23, v22[i]     lane i of W[j] ^ W[j+4]
//   sm3tt2x  v6, v23, vW[i]      lane i of W[j]
// The 'a' forms are used for rounds 0-15 and the 'b' forms for rounds 16-63.
.globl	ossl_hwsm3_block_data_order
.type	ossl_hwsm3_block_data_order,%function
.align	5
ossl_hwsm3_block_data_order:
	AARCH64_VALID_CALL_TARGET
	// Load the eight state words, then reverse the order of the four
	// 32-bit lanes in each register: rev64 swaps the two words inside
	// each 64-bit half, ext #8 swaps the two halves.
	ld1	{v5.4s,v6.4s}, [x0]
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8

	// s16 = first word of .Tj, s17 = second word (each lands in lane 0).
	adr	x8, .Tj
	ldp	s16, s17, [x8]

.Loop:
	// Load one 64-byte block into v0-v3 and advance the data pointer.
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b}, [x1], #64
	// One block fewer to go; tested by the cbnz at the bottom of the loop.
	sub	w2, w2, #1

	// Keep the incoming state for the XOR feed-forward after round 63.
	mov	v18.16b, v5.16b
	mov	v19.16b, v6.16b

	// Little-endian builds only: byte-swap every 32-bit word of the block
	// so that v0-v3 hold W[0..15] as big-endian message words.
#ifndef __ARMEB__
	rev32	v0.16b, v0.16b
	rev32	v1.16b, v1.16b
	rev32	v2.16b, v2.16b
	rev32	v3.16b, v3.16b
#endif

	// Move the first constant from lane 0 of v16 to lane 3 of v20.
	ext	v20.16b, v16.16b, v16.16b, #4

	// ---- rounds 0-3: consume W[0..3] (v0), expand W[16..19] into v4 ----
	// v4  = W7  | W8  | W9  | W10
	ext	v4.16b, v1.16b, v2.16b, #12
	// v22 = W3  | W4  | W5  | W6
	ext	v22.16b, v0.16b, v1.16b, #12
	// v23 = W10 | W11 | W12 | W13
	ext	v23.16b, v2.16b, v3.16b, #8
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	// v22 = W[j] ^ W[j+4] for the four rounds of this group
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	// v21 = v20 rotated left by one bit per lane (shift left 1, then
	// insert the bit shifted out via sri #31): constant for the next round
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ae6	//sm3tt2a v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aae6	//sm3tt2a v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bae6	//sm3tt2a v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 4-7: consume W[4..7] (v1), expand W[20..23] into v0 ----
	// v0  = W11 | W12 | W13 | W14
	ext	v0.16b, v2.16b, v3.16b, #12
	// v22 = W7  | W8  | W9  | W10
	ext	v22.16b, v1.16b, v2.16b, #12
	// v23 = W14 | W15 | W16 | W17
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ae6	//sm3tt2a v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aae6	//sm3tt2a v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bae6	//sm3tt2a v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 8-11: consume W[8..11] (v2), expand W[24..27] into v1 ----
	// v1  = W15 | W16 | W17 | W18
	ext	v1.16b, v3.16b, v4.16b, #12
	// v22 = W11 | W12 | W13 | W14
	ext	v22.16b, v2.16b, v3.16b, #12
	// v23 = W18 | W19 | W20 | W21
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ae6	//sm3tt2a v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aae6	//sm3tt2a v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bae6	//sm3tt2a v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 12-15: consume W[12..15] (v3), expand W[28..31] into v2 ----
	// v2  = W19 | W20 | W21 | W22
	ext	v2.16b, v4.16b, v0.16b, #12
	// v22 = W15 | W16 | W17 | W18
	ext	v22.16b, v3.16b, v4.16b, #12
	// v23 = W22 | W23 | W24 | W25
	ext	v23.16b, v0.16b, v1.16b, #8
.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5682e5	//sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5692e5	//sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ae6	//sm3tt2a v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aae6	//sm3tt2a v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b2e5	//sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bae6	//sm3tt2a v6.4s, v23.4s, v3.4s[3]

	// From round 16 on the second constant is used: discard the rotated
	// first constant and restart from v17 (lane 0 -> lane 3 of v20).
	// The 'b' forms of sm3tt1/sm3tt2 are used from here to the end.
	ext	v20.16b, v17.16b, v17.16b, #4

	// ---- rounds 16-19: consume W[16..19] (v4), expand W[32..35] into v3 ----
	// v3  = W23 | W24 | W25 | W26
	ext	v3.16b, v0.16b, v1.16b, #12
	// v22 = W19 | W20 | W21 | W22
	ext	v22.16b, v4.16b, v0.16b, #12
	// v23 = W26 | W27 | W28 | W29
	ext	v23.16b, v1.16b, v2.16b, #8
.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 20-23: consume W[20..23] (v0), expand W[36..39] into v4 ----
	// v4  = W27 | W28 | W29 | W30
	ext	v4.16b, v1.16b, v2.16b, #12
	// v22 = W23 | W24 | W25 | W26
	ext	v22.16b, v0.16b, v1.16b, #12
	// v23 = W30 | W31 | W32 | W33
	ext	v23.16b, v2.16b, v3.16b, #8
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 24-27: consume W[24..27] (v1), expand W[40..43] into v0 ----
	// v0  = W31 | W32 | W33 | W34
	ext	v0.16b, v2.16b, v3.16b, #12
	// v22 = W27 | W28 | W29 | W30
	ext	v22.16b, v1.16b, v2.16b, #12
	// v23 = W34 | W35 | W36 | W37
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 28-31: consume W[28..31] (v2), expand W[44..47] into v1 ----
	// v1  = W35 | W36 | W37 | W38
	ext	v1.16b, v3.16b, v4.16b, #12
	// v22 = W31 | W32 | W33 | W34
	ext	v22.16b, v2.16b, v3.16b, #12
	// v23 = W38 | W39 | W40 | W41
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 32-35: consume W[32..35] (v3), expand W[48..51] into v2 ----
	// v2  = W39 | W40 | W41 | W42
	ext	v2.16b, v4.16b, v0.16b, #12
	// v22 = W35 | W36 | W37 | W38
	ext	v22.16b, v3.16b, v4.16b, #12
	// v23 = W42 | W43 | W44 | W45
	ext	v23.16b, v0.16b, v1.16b, #8
.inst	0xce61c062	//sm3partw1 v2.4s, v3.4s, v1.4s
.inst	0xce76c6e2	//sm3partw2 v2.4s, v23.4s, v22.4s
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]

	// ---- rounds 36-39: consume W[36..39] (v4), expand W[52..55] into v3 ----
	// v3  = W43 | W44 | W45 | W46
	ext	v3.16b, v0.16b, v1.16b, #12
	// v22 = W39 | W40 | W41 | W42
	ext	v22.16b, v4.16b, v0.16b, #12
	// v23 = W46 | W47 | W48 | W49
	ext	v23.16b, v1.16b, v2.16b, #8
.inst	0xce62c083	//sm3partw1 v3.4s, v4.4s, v2.4s
.inst	0xce76c6e3	//sm3partw2 v3.4s, v23.4s, v22.4s
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 40-43: consume W[40..43] (v0), expand W[56..59] into v4 ----
	// v4  = W47 | W48 | W49 | W50
	ext	v4.16b, v1.16b, v2.16b, #12
	// v22 = W43 | W44 | W45 | W46
	ext	v22.16b, v0.16b, v1.16b, #12
	// v23 = W50 | W51 | W52 | W53
	ext	v23.16b, v2.16b, v3.16b, #8
.inst	0xce63c004	//sm3partw1 v4.4s, v0.4s, v3.4s
.inst	0xce76c6e4	//sm3partw2 v4.4s, v23.4s, v22.4s
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// ---- rounds 44-47: consume W[44..47] (v1), expand W[60..63] into v0 ----
	// v0  = W51 | W52 | W53 | W54
	ext	v0.16b, v2.16b, v3.16b, #12
	// v22 = W47 | W48 | W49 | W50
	ext	v22.16b, v1.16b, v2.16b, #12
	// v23 = W54 | W55 | W56 | W57
	ext	v23.16b, v3.16b, v4.16b, #8
.inst	0xce64c020	//sm3partw1 v0.4s, v1.4s, v4.4s
.inst	0xce76c6e0	//sm3partw2 v0.4s, v23.4s, v22.4s
	eor	v22.16b, v1.16b, v2.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce418ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce419ee6	//sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce41aee6	//sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce41bee6	//sm3tt2b v6.4s, v23.4s, v1.4s[3]

	// ---- rounds 48-51: consume W[48..51] (v2), expand W[64..67] into v1 ----
	// (last expansion; W[64..67] is only needed for W[j]^W[j+4] in
	// rounds 60-63)
	// v1  = W55 | W56 | W57 | W58
	ext	v1.16b, v3.16b, v4.16b, #12
	// v22 = W51 | W52 | W53 | W54
	ext	v22.16b, v2.16b, v3.16b, #12
	// v23 = W58 | W59 | W60 | W61
	ext	v23.16b, v4.16b, v0.16b, #8
.inst	0xce60c041	//sm3partw1 v1.4s, v2.4s, v0.4s
.inst	0xce76c6e1	//sm3partw2 v1.4s, v23.4s, v22.4s
	eor	v22.16b, v2.16b, v3.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce428ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce429ee6	//sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce42aee6	//sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce42bee6	//sm3tt2b v6.4s, v23.4s, v2.4s[3]

	// ---- rounds 52-55: consume W[52..55] (v3); no further expansion ----
	eor	v22.16b, v3.16b, v4.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce438ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce439ee6	//sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce43aee6	//sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce43bee6	//sm3tt2b v6.4s, v23.4s, v3.4s[3]

	// ---- rounds 56-59: consume W[56..59] (v4) ----
	eor	v22.16b, v4.16b, v0.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce448ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce449ee6	//sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce44aee6	//sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce44bee6	//sm3tt2b v6.4s, v23.4s, v4.4s[3]

	// ---- rounds 60-63: consume W[60..63] (v0), paired with W[64..67] (v1) ----
	eor	v22.16b, v0.16b, v1.16b
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce5686e5	//sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst	0xce408ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce5696e5	//sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst	0xce409ee6	//sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst	0xce5418b7	//sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
	shl	v21.4s, v20.4s, #1
	sri	v21.4s, v20.4s, #31
.inst	0xce56a6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst	0xce40aee6	//sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst	0xce5518b7	//sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
	shl	v20.4s, v21.4s, #1
	sri	v20.4s, v21.4s, #31
.inst	0xce56b6e5	//sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst	0xce40bee6	//sm3tt2b v6.4s, v23.4s, v0.4s[3]

	// Feed-forward: XOR the state saved at the top of the loop into the
	// result of the 64 rounds.
	eor	v5.16b, v5.16b, v18.16b
	eor	v6.16b, v6.16b, v19.16b

	// Any remaining blocks?  (32-bit test on the counter decremented above.)
	cbnz	w2, .Loop

	// Undo the lane reversal applied on entry and store the state back.
	rev64	v5.4s, v5.4s
	rev64	v6.4s, v6.4s
	ext	v5.16b, v5.16b, v5.16b, #8
	ext	v6.16b, v6.16b, v6.16b, #8
	st1	{v5.4s,v6.4s}, [x0]
	ret
.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order

// Round-constant seeds for ossl_hwsm3_block_data_order, fetched as a pair
// by `ldp s16, s17, [x8]`.
//   word 0 (0x79cc4519): starting constant for rounds 0-15.
//   word 1 (0x9d8a7a87): starting constant for round 16 onwards.  This is
//     0x7a879d8a rotated left by 16 bits; the routine rotates the active
//     constant left by one bit after every round, so the value it switches
//     to at round 16 is stored already rotated by 16.
// .align 3 places the table on an 8-byte boundary.
// NOTE(review): no section directive precedes this table, so it is
// assembled into .text right after the function and addressed with `adr`
// -- confirm this is acceptable for targets that map code execute-only.
.align	3
.Tj:
.word	0x79cc4519, 0x9d8a7a87
